Code
# Loading in packages
library(tidyverse)
library(ggplot2) # also attached by tidyverse; kept for explicitness
library(gt)
library(kableExtra)

# Load in data set
names <- read_csv(here::here("Week 9", "StateNames_A.csv"))

# Show the raw data as a DT table
DT::datatable(names)

# Renaming column
names <- names |>
  rename(Sex = Gender)

#Source for how to make the table striped: https://apreshill.github.io/data-vis-labs-2018/05-tables.html
# Revised %in% to == because it's a single value
# Total number of people named Allison in each state, one column per sex.
names |>
  filter(Name == "Allison") |>
  group_by(State, Sex) |>
  summarize(Count = sum(Count), .groups = "drop") |>
  pivot_wider(names_from = Sex, values_from = Count, values_fill = 0) |>
  # Label the sex columns by name rather than by position, so the headers
  # cannot be swapped if the sexes appear in a different order in the data.
  select(State, Female = "F", Male = "M") |>
  knitr::kable(
    format = "html",
    digits = 3,
    caption = "Number of People Named Allison by Sex and State"
  ) |>
  kable_styling(font_size = 15, bootstrap_options = "striped")

| State | Female | Male |
|---|---|---|
| AK | 232 | 0 |
| AL | 1535 | 0 |
| AR | 1198 | 0 |
| AZ | 1880 | 0 |
| CA | 12413 | 0 |
| CO | 1594 | 0 |
| CT | 1099 | 0 |
| DC | 321 | 0 |
| DE | 294 | 0 |
| FL | 4455 | 0 |
| GA | 3257 | 0 |
| HI | 183 | 0 |
| IA | 1477 | 0 |
| ID | 451 | 0 |
| IL | 5110 | 0 |
| IN | 3067 | 0 |
| KS | 1283 | 0 |
| KY | 1905 | 20 |
| LA | 1209 | 0 |
| MA | 2218 | 0 |
| MD | 2229 | 0 |
| ME | 340 | 0 |
| MI | 4014 | 0 |
| MN | 2374 | 0 |
| MO | 2882 | 0 |
| MS | 817 | 0 |
| MT | 226 | 0 |
| NC | 3435 | 0 |
| ND | 285 | 0 |
| NE | 807 | 0 |
| NH | 412 | 0 |
| NJ | 3052 | 0 |
| NM | 399 | 0 |
| NV | 729 | 0 |
| NY | 5747 | 0 |
| OH | 5487 | 0 |
| OK | 1421 | 0 |
| OR | 1186 | 0 |
| PA | 4307 | 0 |
| RI | 306 | 0 |
| SC | 1228 | 0 |
| SD | 376 | 0 |
| TN | 2488 | 0 |
| TX | 10192 | 0 |
| UT | 1125 | 0 |
| VA | 3220 | 0 |
| VT | 135 | 0 |
| WA | 1956 | 0 |
| WI | 2367 | 0 |
| WV | 813 | 0 |
| WY | 142 | 0 |
# Revised %in% to == because it's a single value
# Keep only the rows for females named Allison.
allison_f <- filter(names, Name == "Allison" & Sex == "F")

#Revised so the graph is a line graph and not a scatter plot and took out the y axis so people do not have to tilt their heads to read this graph
# Line graph of the total number of female Allisons per year.
allison_f |>
  group_by(Year) |>
  summarize(Count = sum(Count), .groups = "drop") |>
  ggplot(mapping = aes(x = Year, y = Count)) +
  geom_line(color = "cadetblue") +
  # One axis break per year, written as a numeric range instead of
  # relying on character strings being coerced to numbers.
  scale_x_continuous(breaks = 1997:2014) +
  labs(x = "Year", y = "",
       title = "Total Number of Female Allisons Born Each Year") +
  theme(text = element_text(colour = "navy"))

# The same yearly totals as points, with a fitted linear trend line.
allison_f |>
  group_by(Year) |>
  summarize(Count = sum(Count), .groups = "drop") |>
  ggplot(mapping = aes(x = Year, y = Count)) +
  geom_point(color = "darkorange") +
  # Spell out the default formula so ggplot2 does not print a message
  geom_smooth(method = "lm", formula = y ~ x) +
  scale_x_continuous(breaks = 1997:2014) +
  labs(x = "Year", y = "Count",
       title = "Total Number of Female Allisons Born Each Year")

# Linear regression of the yearly total on the year.
allison_f_lm <- allison_f |>
  group_by(Year) |>
  summarize(Count = sum(Count), .groups = "drop") |>
  lm(Count ~ Year, data = _)
allison_f_lm
Call:
lm(formula = Count ~ Year, data = summarize(group_by(allison_f,
Year), Count = sum(Count), .groups = "drop"))
Coefficients:
(Intercept) Year
209689.8 -101.5
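The regression equation is: $\hat{y} = 209689.8 - 101.5x$, where $x$ is the year and $\hat{y}$ is the predicted number of female Allisons born in that year.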
# Residuals against fitted values for the linear model fitted above.
broom::augment(allison_f_lm) |>
  ggplot(aes(x = .fitted, y = .resid)) +
  geom_point(colour = "navy") +
  labs(
    title = "Plotted Residuals vs Fitted values",
    x = "Fitted Values",
    y = "Residual Values"
  )

I see two patterns in the residuals: one is a “U” shape on the left, and there is another grouping of residuals on the right.
From this model I have concluded that the number of females named Allison has decreased since 1997; however, there are a couple of years, 2008 and 2009, where the number of Allisons increased. I would say that the popularity of the name Allison has decreased, but it is still a popular name.
# Source for changing text size: https://statisticsglobe.com/change-font-size-of-ggplot2-plot-in-r-axis-text-main-title-legend
# Keep the male records for the three spellings: Alan, Allan and Allen.
allen <- filter(names, Sex == "M" & Name %in% c("Alan", "Allan", "Allen"))
# Yearly totals for each spelling, drawn as one facet per spelling.
allen |>
  group_by(Year, Name) |>
  summarize(Count = sum(Count), .groups = "drop") |>
  ggplot(mapping = aes(x = Year, y = Count)) +
  geom_line(color = "maroon") +
  facet_wrap(~ Name) +
  theme(text = element_text(size = 15))

# Counts of each spelling in California and Pennsylvania for the year 2000.
allen |>
  # Compare Year with a number, not a string, to avoid implicit coercion
  filter(Year == 2000, State %in% c("PA", "CA")) |>
  pivot_wider(names_from = Name, values_from = Count, values_fill = 0) |>
  select(State, Alan, Allen, Allan) |>
  gt() |>
  tab_header(title = "Number of Alan, Allen, or Allan by State")

| Number of Alan, Allen, or Allan by State | |||
| State | Alan | Allen | Allan |
|---|---|---|---|
| CA | 579 | 176 | 131 |
| PA | 51 | 56 | 12 |
# changed the decimals so they are percentages
# Share of each spelling within each state for the year 2000.
allen |>
  # Compare Year with a number, not a string, to avoid implicit coercion
  filter(Year == 2000, State %in% c("PA", "CA")) |>
  # Divide by the state's own total so each state's row sums to 100%,
  # rather than by the combined CA + PA total.
  group_by(State) |>
  mutate(prop = Count / sum(Count)) |>
  ungroup() |>
  select(Sex, State, Name, prop) |>
  pivot_wider(names_from = Name, values_from = prop, values_fill = 0) |>
  gt() |>
  tab_header(title = "Percentage of Alan, Allen, or Allan by State") |>
  # Select the percentage columns by name instead of by position
  fmt_percent(columns = c(Alan, Allen, Allan), decimals = 2)

| Percentage of Alan, Allen, or Allan by State | ||||
| Sex | State | Alan | Allen | Allan |
|---|---|---|---|---|
| M | CA | 65.35% | 19.86% | 14.79% |
| M | PA | 42.86% | 47.06% | 10.08% |